Genome-resolved metagenomics on anaerobic enrichments from deep subsurface groundwaters

Author

george.westmeijer@umu.se

Published

May 6, 2025

Document settings

Expand code
cite_packages(output = "table", pkgs = "Session", out.dir = getwd())
    Package Version   Citation
1      base   4.4.2      @base
2 patchwork   1.3.0 @patchwork
3 tidyverse   2.0.0 @tidyverse

Read data and assign colors

The 16S ribosomal RNA data is from several sequencing projects. In chronological order, P15009 contains the groundwater samples from t0 (sampled Oct 2019), while P21015 contains the samples from t24 (sampled Feb 2020) from identical groundwaters. P21015 contains the sequencing libraries from the cultures as part of the inital experiment (one meteoric groundwater), while P26201 contains the libraries from the cultures as part of the follow-up experiment (using three groundwaters as inoculum).

All 16S libraries were processed with the bioinformatic pipeline nf-core/ampliseq (v2.12.0), using the SBDI-GTDB (R09-RS220-1) as a reference.

Expand code
seqtab <- read_tsv("data/ASV_table.tsv.gz", col_types = cols(.default = col_character(), 
                                    count = col_integer()
                                    )) %>%
  group_by(sample) %>% mutate(relab = count / sum(count)) %>% ungroup()

tax <- read_tsv("data/ASV_tax.tsv.gz", col_types = cols(.default = col_character()))
               
smd <- read_tsv("data/smd.tsv", col_types = cols(.default = col_character())) %>%
  filter(!sample %in% c("P21015_1082","P21015_1083","P21015_1084","P21015_1087","P21015_1088","P21015_1089")) %>%
  filter(fraction != "control")

This project includes 15 metagenomes from 15 unique cultures. The metagenomes contain 35 de-replicated metagenome-assembled genomes (MAG) that are affiliated with the Bacillota (7), Desulfobacterota (7), Pseudomonadota (6), Bacteroidota (4), Patescibacteria (3), Nanoarchaeota (2), Acidobacteriota (1), Campylobacterota (1), Chloroflexota (1), Iainarchaeota (1), Spirochaetota (1), and Verrucomicrobiota (1).

Figure 1: sampling-site
Expand code
coverm <- read_tsv("data/coverm.tsv.gz", col_types = cols(.default = col_character(), tpm = col_double()))

emapper <- read_tsv("data/emapper.tsv.gz", show_col_types = FALSE) 

# Sample 35 (l7f) was removed from the binning process as this sample did not produce any bins
bintax <- read_tsv("data/gtdbtk.summary.tsv", col_types = cols(.default = col_character())) 

quality <- read_tsv("data/quality_report.tsv", show_col_types = F) %>%
  rename_with(tolower) %>%
  rename(genome = name) %>%
  filter(genome %in% coverm$genome)
Expand code
cols.gw <- c(
  "KR0015" = "#8D8680",
  "SA1420" = "#A2A475",
  "SA2600" = "#C7B19C"
  )

cols.origin <- c(
  "meteoric" = "#8D8680",
  "marine" = "#A2A475",
  "saline" = "#C7B19C"
  )

cols.fraction <- c(
  "environmental" = "#FAEFD1",
  "fractionated" = "#A2A475",
  "non-fractionated" = "#1B5331"
  )

cols.mag <- c(
  "Other" = "#899DA4",
  "Nanoarchaeota" = "#046C9A",
  "Patescibacteria" = "#C7B19C",
  "Pseudomonadota" = "#972D15",
  "Desulfobacterota" = "#A2A475",
  "Bacillota" = "#1B5331"
  )

cols.phylum <- c( # Sorted on abundance
  "Patescibacteria" = "#C7B19C",
  "Desulfobacterota" = "#A2A475",
  "Chloroflexota" = "#D69C4E",
  "Atribacterota" = "#972D15",
  "Acidobacteriota" = "#FAEFD1",
  "Omnitrophota" = "#00A08A",
  "Nitrospirota" = "#D8B70A",
  "Campylobacterota" = "#1B5331",
  "Other" = "#899DA4",
  "Unidentified" = "#046C9A"
)

16S data groundwaters

Rarefaction 16S data

The 16S data from the groundwater samples are rarefied to check if there is sufficient sequencing depth to estimate microbial diversity.

Expand code
t0 <- c(
  "P15009_1057","P15009_1059","P15009_1060","P21015_1020","P21015_1021","P21015_1022", # KR0015
  "P15009_1061","P15009_1065","P15009_1066","P21015_1023","P21015_1024","P21015_1025", # SA1420
  "P15009_1069","P15009_1062","P15009_1063","P21015_1029","P21015_1030","P21015_1031"  # SA2600
       )

# Perform the subsampling
rare <- seqtab %>%
  # Remove negative control
  filter(sample %in% t0) %>%
  select(-relab) %>%
  pivot_wider(names_from = "seqid", values_from = "count", values_fill = 0) %>%
  column_to_rownames("sample") %>%
  vegan::rarecurve(sample = 10000, step = 100, tidy = TRUE) %>%
  tibble() %>%
  rename_with(tolower)
Expand code
rare %>%
  inner_join(smd, by = c("site" = "sample")) %>%
  # One sample has ~ 1 million reads 
  filter(sample < 150000) %>%
  ggplot(aes(sample, species, group = site, colour = origin)) + 
  geom_line(linewidth = 0.75) +
  scale_color_manual(values = cols.origin, name = "Groundwater", labels = c("Meteoric", "Marine", "Saline")) +
  labs(x = "No. of sequenced reads", y = "Rarefied No. of ASVs") +
  scale_x_continuous(labels = c("0", "50.000", "100.000", "150.000")) +
  theme_tidy() +
  theme(
    panel.border = element_rect(fill = "transparent", linewidth = 0.75, colour = "#000000"),

  )
Figure 2: Rarefaction curves for the groundwater samples (t0) for each groundwater type (n = 6).
Expand code
ggsave(filename = "figures/fig-s1.pdf", width = 100, height = 80, units = "mm")

Redundancy analysis

Expand code
site <- magick::image_read("figures/äspö-mod.png")

p1 <- ggplot() + 
  annotation_custom(grid::rasterGrob(site)) + 
  theme_minimal() + coord_fixed() +
  theme(
    plot.margin = margin(), 
    aspect.ratio = 0.75
    )

As the hydrochemistry data is from a monitoring program, the sampling data closest to groundwater sampling was selected. All units are mg l-1, except for EC (mS m-1), pH, T (degrees C), and O18 (ppt SMOW)

Expand code
chem <- read_tsv("data/sicada.txt", show_col_types = FALSE) %>% 
  rename_with(tolower) %>% 
  filter(sampling) %>%
  select(groundwater = idcode, so4, feii, ph_lab, ec_lab, temp_w, doc, nh4_n, o18, s34_so4) %>%
  inner_join(smd[,c("sample","groundwater")], by = "groundwater")

The number of ASVs per sample are estimated with the specnumber() function from the Vegan library.

Expand code
adiv <- seqtab %>%
  select(-relab) %>%
  pivot_wider(names_from = "seqid", values_from = "count", values_fill = 0) %>%
  column_to_rownames('sample') %>%
  vegan::specnumber() %>% as.data.frame() %>%
  rownames_to_column('sample') %>% #filter(sample %in% t0) %>%
  rename(specnumber = 2)

chem <- adiv %>% # Add the species richness for each sample
  inner_join(chem, by = "sample") 

In order to run the RDA, the absolute counts are standardized using a Hellinger transformation (square root of the relative abundance).

Expand code
hellinger <- seqtab %>%
  filter(sample %in% t0) %>%
  # Standard Vegan transformation: Turn table with with samples as *rows*
  select(sample, seqid, count) %>%
  pivot_wider(names_from = "seqid", values_from = "count", values_fill = 0) %>%
  # Turn into a numeric matrix
  column_to_rownames('sample') %>% vegan::decostand(method = "hellinger") %>%
  rownames_to_column("sample") %>%
  pivot_longer(-1, names_to = "seqid", values_to = "hellinger")

m <- hellinger %>% # Create a matrix
  spread(seqid, hellinger) %>%
  column_to_rownames("sample")

The RDA is called while providing a large number of constraining factors, as the RDA function will by default not include redundant variables (Pearson r > 0.9).

Expand code
set.seed(999)

vegan::rda(
  m ~ o18 + doc + ph_lab + ec_lab + temp_w + nh4_n + feii, 
  correlation = T,
  data = chem %>%
    semi_join(chem, by = 'sample') %>%
    filter(sample %in% t0) %>%
    arrange(sample) %>%
    column_to_rownames('sample') 
) -> rda

Some constraints or conditions were aliased because they were redundant. This
can happen if terms are linearly dependent (collinear): 'ph_lab', 'ec_lab',
'temp_w', 'nh4_n', 'feii'
Expand code
# This one will contain the proportions explained which we get by dividing the
# eigenvalue of each component with the sum of eigenvalues for all components.
p <- c(rda$CCA$eig, rda$CA$eig)/sum(c(rda$CCA$eig, rda$CA$eig))
# This one is collecting the coordinates for arrows that will depict how the 
# factors in our model point in the coordinate
a <- rda$CCA$biplot %>% data.frame() %>% tibble::rownames_to_column('factor')

lab <- tibble(factor = c("o18", "doc"),
            label = c("delta^18*O", "DOC")
) %>% inner_join(a, by = "factor")

summary(rda)

Call:
rda(formula = m ~ o18 + doc + ph_lab + ec_lab + temp_w + nh4_n +      feii, data = chem %>% semi_join(chem, by = "sample") %>%      filter(sample %in% t0) %>% arrange(sample) %>% column_to_rownames("sample"),      correlation = T) 

Partitioning of variance:
              Inertia Proportion
Total          0.6761     1.0000
Constrained    0.3871     0.5725
Unconstrained  0.2890     0.4275

Eigenvalues, and their contribution to the variance 

Importance of components:
                        RDA1   RDA2     PC1     PC2     PC3     PC4     PC5
Eigenvalue            0.2338 0.1533 0.05443 0.04926 0.04333 0.02922 0.02273
Proportion Explained  0.3458 0.2267 0.08051 0.07286 0.06409 0.04322 0.03363
Cumulative Proportion 0.3458 0.5725 0.65305 0.72591 0.79001 0.83323 0.86686
                          PC6     PC7     PC8    PC9     PC10     PC11     PC12
Eigenvalue            0.01598 0.01338 0.01224 0.0098 0.008638 0.007691 0.007139
Proportion Explained  0.02363 0.01979 0.01810 0.0145 0.012776 0.011376 0.010559
Cumulative Proportion 0.89049 0.91028 0.92837 0.9429 0.955646 0.967022 0.977581
                          PC13     PC14     PC15
Eigenvalue            0.006254 0.004717 0.004186
Proportion Explained  0.009250 0.006977 0.006192
Cumulative Proportion 0.986831 0.993808 1.000000

Accumulated constrained eigenvalues
Importance of components:
                        RDA1   RDA2
Eigenvalue            0.2338 0.1533
Proportion Explained  0.6040 0.3960
Cumulative Proportion 0.6040 1.0000
Expand code
vegan::RsquareAdj(rda)
$r.squared
[1] 0.5725442

$adj.r.squared
[1] 0.5155501
Expand code
p2 <- rda$CCA$wa %>% data.frame() %>% tibble::rownames_to_column('sample') %>%
  inner_join(chem, by = 'sample') %>%
  ggplot(aes(x = RDA1, y = RDA2)) +
  geom_vline(xintercept = 0, colour = "grey", linetype = "dotted") +
  geom_hline(yintercept = 0, colour = "grey", linetype = "dotted") +
  geom_point(aes(colour = groundwater, size = specnumber), shape = 1, stroke = 0.5) +
  xlab(sprintf("RDA1 (%2.1f%% explained)", p[[1]] * 100)) +
  ylab(sprintf("RDA2 (%2.1f%% explained)", p[[2]] * 100)) +
  geom_segment(
    data = a, mapping = aes(xend = RDA1/5, yend = RDA2/5), linewidth = 0.3,
    x = 0, y = 0, arrow = arrow(length = unit(1.5, "mm"), type = "closed") 
    ) +
  geom_label(
    data = lab, mapping = aes(x = RDA1/10, y = RDA2/10, label = label),
    size = 6/.pt, parse = TRUE, label.size = 0, label.padding = unit(0.1, "lines")
    ) + 
  scale_size(range = c(1,8), name = "No. of ASVs") +
  annotate("text", x = -0.07, y =-0.3, label = "Meteoric", colour = cols.gw[1], size = 7/.pt) +
  annotate("text", x = 0.06, y = 0.28, label = "Marine", colour = cols.gw[2], size = 7/.pt) +
  annotate("text", x =  0.2, y = -0.05, label = "Saline", colour = cols.gw[3], size = 7/.pt) +
  stat_ellipse(aes(x = RDA1, y = RDA2, colour = groundwater), 
               geom = "polygon", show.legend = F,
               fill = NA, linetype = "dashed") +
  # Add R-squared
  annotate("text", x = Inf, y = -Inf, 
           label = "paste(R ^ 2, \" = 0.52\")", 
           size = 6/.pt, vjust = -0.5, hjust = 1.05, parse = TRUE) +
  scale_color_manual(name = "",values = cols.gw, guide = "none") +
  scale_fill_manual(name = "",values = cols.gw, guide = "none") +
  theme_tidy(fontsize = 6) +
  theme(
    aspect.ratio = 1.0,
    legend.background = element_rect(fill = NA),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.text = element_text(margin = margin(r = 12, unit = "pt")),
    legend.position = "inside",
    legend.position.inside  = c(0.9,0.85)
    )

Overview community structure t0

To visualize the community composition, the most abundant phyla are identified while grouping less abundant phyla as “Other”. ASVs that are not identified on phylum level are included as “Unidentified”.

Expand code
t <- seqtab %>%
  filter(sample %in% t0) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  filter(!is.na(phylum)) %>%
  top_n(8, mean_relab)

t %>% arrange(desc(mean_relab)) %>% knitr::kable()
phylum mean_relab
Patescibacteria 7.1312325
Desulfobacterota 2.7429408
Chloroflexota 1.2292858
Atribacterota 1.2125307
Acidobacteriota 0.9182271
Omnitrophota 0.7237157
Nitrospirota 0.3144247
Campylobacterota 0.2617040
Expand code
taxref <- tax %>%
  left_join(t %>% transmute(phylum, topphylum = phylum), by = "phylum") %>%
  mutate(topphylum = if_else(is.na(phylum), "Unidentified", topphylum)) %>%
  replace_na(list("topphylum" = "Other"))
Expand code
p3 <- seqtab %>%
  filter(sample %in% t0) %>%
  inner_join(taxref, by = "seqid") %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, topphylum) %>% 
  summarise(mrelab = sum(relab), .groups = 'drop') %>%
  # Call the plot
  ggplot(aes(x = fct_relevel(sample, t0), 
             y = mrelab * 100, 
             fill = fct_relevel(topphylum, names(cols.phylum) %>% rev())
             )
         ) +
  labs(x = '', y = 'Relative abundance (%)', fill = "") +
  geom_col() + scale_y_continuous(expand = c(0.01,0), limits = c(-10,101)) +
  annotate(geom = "segment", x =  0.55, xend =  3.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  3.55, xend =  6.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  6.55, xend =  9.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  9.55, xend = 12.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x = 12.55, xend = 15.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x = 15.55, xend = 18.45, y = -1, yend = -1, linewidth = 0.5) +
  # Labels dates
  annotate(geom = "text", x =  2, y = -3.5, label = "Oct 2019", size = 6/.pt) +
  annotate(geom = "text", x =  5, y = -3.5, label = "April 2020", size = 6/.pt) +
  annotate(geom = "text", x =  8, y = -3.5, label = "Oct 2019", size = 6/.pt) +
  annotate(geom = "text", x = 11, y = -3.5, label = "April 2020", size = 6/.pt) +
  annotate(geom = "text", x = 14, y = -3.5, label = "Oct 2019", size = 6/.pt) +
  annotate(geom = "text", x = 17, y = -3.5, label = "April 2020", size = 6/.pt) +
  # Labels groundwater
  annotate(geom = "text", x = 3.5, y = -8, label = "Meteoric", size = 6/.pt) +
  annotate(geom = "text", x = 9.5, y = -8, label = "Marine", size = 6/.pt) +
  annotate(geom = "text", x =15.5, y = -8, label = "Saline", size = 6/.pt) +
  scale_fill_manual(values = cols.phylum) + 
  guides(fill = guide_legend(ncol = 3, reverse = TRUE)) +
  theme_tidy(legend = "bottom", fontsize = 6) + 
  theme(
    aspect.ratio = 0.75,
    axis.text.x = element_blank(), 
    axis.ticks.x = element_blank(),
    legend.key.height = unit(3.5, "mm"),
    legend.margin = margin(t = -20),
    legend.text = element_text(margin = margin(l = 2)),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.key.spacing.y = unit(0, "mm")
    )

Alpha diversity

Expand code
adiv <- seqtab %>%
  select(-relab) %>%
  spread(seqid, count, fill = 0) %>% 
  column_to_rownames('sample') %>%
  vegan::diversity() %>% as.data.frame() %>%
  rownames_to_column('sample') %>%
  rename(shannon = 2) %>%
  inner_join(smd, by = "sample")

adiv %>% filter(sample %in% t0) %>%
  group_by(groundwater) %>% 
  summarise(mean = round(mean(shannon), digits = 2), sd = round(sd(shannon), digits = 2), n = n())
# A tibble: 3 × 4
  groundwater  mean    sd     n
  <chr>       <dbl> <dbl> <int>
1 KR0015       5.66  0.62     6
2 SA1420       3.27  1.07     6
3 SA2600       3.03  0.44     6
Expand code
p4 <- adiv %>%
  filter(fraction != "control") %>%
  mutate(fraction = factor(fraction, levels = c("environmental","fractionated","non-fractionated"))) %>%
  ggplot(aes(
    x = groundwater,
    y = shannon,
    fill = fraction
    )) +
  geom_boxplot( outlier.colour = "white") +
  labs(x = "", y = "Shannon's diversity index (H)", fill = "") +
  theme_tidy(legend = "inside", fontsize = 6) +
  scale_x_discrete(labels = c("Meteoric", "Marine", "Saline")) + 
  scale_fill_manual(values = cols.fraction, labels = c("Environmental", "< 0.45 µm", "Non-fractionated")) +
  theme(
    aspect.ratio = 1.0,
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.margin = margin(),
    legend.background = element_rect(fill = "NA"),
    legend.position.inside = c(0.8,0.88),
    legend.text = element_text(margin = margin(l = 1))
  )
Expand code
p1 + p2 + p3 + p4 + plot_layout(nrow = 2) + plot_annotation(tag_levels = "a") & 
  theme(
    plot.margin = margin(t = 4, r = .1),
    plot.tag.location = "panel",
    plot.tag.position = c(-0.07, 1.0),
    plot.tag = element_text(size = 8, face = "bold")
    )

Expand code
ggsave("figures/fig-1.pdf", width = 180, height = 180, units = "mm")

Community structure on abundant groups (not tied to a specific rank)

Expand code
t <- tax %>%
  mutate(rank = case_when(
    class == "ABY1" ~ "ABY1",
    class == "Paceibacteria" ~ "Paceibacteria",
    class == "Microgenomatia" ~ "Microgenomatia",
    class == "JS1" ~ "JS1",
    order == "Syntrophales" ~ "Syntrophales",
    order == "Desulfobulbales" ~ "Desulfobulbales",
    order == "Anaerolineales" ~ "Anaerolineales",
    order == "Thermodesulfovibrionales" ~ "Thermodesulfovibrionales",
    order == "Gygaellales" ~ "Gygaellales",
    genus == "Sulfuricurvum" ~ "Sulfuricurvum",
    .default = NA
  )) %>% filter(!is.na(rank))

cols.rank <- c( # Sorted on abundance
  "ABY1" = "#C7B19C",
  "Paceibacteria" = "#A2A475",
  "Microgenomatia" = "#D69C4E",
  "JS1" = "#972D15",
  "Syntrophales" = "#FAEFD1",
  "Desulfobulbales" = "#00A08A",
  "Anaerolineales" = "#D8B70A",
  "Thermodesulfovibrionales" = "#1B5331",
  "Gygaellales" = "#899DA4",
  "Sulfuricurvum" = "#046C9A"
)
Expand code
seqtab %>%
  filter(sample %in% t0) %>%
  inner_join(t, by = "seqid") %>%
  group_by(sample, rank) %>% 
  summarise(mrelab = sum(relab), .groups = 'drop') %>%
  # Call the plot
  ggplot(aes(x = fct_relevel(sample, t0), 
             y = mrelab * 100, 
             fill = fct_relevel(rank, names(cols.rank) %>% rev())
             )
         ) +
  labs(x = '', y = 'Relative abundance (%)', fill = "") +
  geom_col() +
  annotate(geom = "segment", x =  0.55, xend =  3.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  3.55, xend =  6.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  6.55, xend =  9.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  9.55, xend = 12.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x = 12.55, xend = 15.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x = 15.55, xend = 18.45, y = -1, yend = -1, linewidth = 0.5) +
  # Labels dates
  annotate(geom = "text", x =  2, y = -3.5, label = "Oct 2019", size = 6/.pt) +
  annotate(geom = "text", x =  5, y = -3.5, label = "April 2020", size = 6/.pt) +
  annotate(geom = "text", x =  8, y = -3.5, label = "Oct 2019", size = 6/.pt) +
  annotate(geom = "text", x = 11, y = -3.5, label = "April 2020", size = 6/.pt) +
  annotate(geom = "text", x = 14, y = -3.5, label = "Oct 2019", size = 6/.pt) +
  annotate(geom = "text", x = 17, y = -3.5, label = "April 2020", size = 6/.pt) +
  # Labels groundwater
  annotate(geom = "text", x = 3.5, y = -8, label = "Meteoric", size = 6/.pt) +
  annotate(geom = "text", x = 9.5, y = -8, label = "Marine", size = 6/.pt) +
  annotate(geom = "text", x =15.5, y = -8, label = "Saline", size = 6/.pt) +
  scale_fill_manual(values = cols.rank) + 
  guides(fill = guide_legend(ncol = 3, reverse = TRUE)) +
  theme_tidy(legend = "bottom", fontsize = 6) + 
  theme(
    aspect.ratio = 0.75,
    axis.text.x = element_blank(), 
    axis.ticks.x = element_blank(),
    legend.key.height = unit(3.5, "mm"),
    legend.margin = margin(t = -20),
    legend.text = element_text(margin = margin(l = 2)),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.key.spacing.y = unit(0, "mm")
    )

Expand code
ggsave(filename = "figures/fig-s2.pdf", width = 120, height = 90, units = "mm")

Network analysis: phylum

Expand code
t <- c(
  "Spirochaetota", "Acidobacteriota", "Atribacterota", "Chloroflexota",
  "Omnitrophota", "Nanoarchaeota", "Campylobacterota", "Pseudomonadota"
)

network.survey <- read_tsv("data/network-survey.tsv", show_col_types = F)

# Correlation between the abundance of the phyla using the cultures
fit <- tibble("phylum" = t, "r.squared" = NA)
for (i in t) {
  model <- lm(relab ~ cpr, data = network.survey %>% filter(phylum == i)) %>% summary()
  fit[fit$phylum == i, "r.squared"] <- model$adj.r.squared %>% round(digits = 2)
}

fit
# A tibble: 8 × 2
  phylum           r.squared
  <chr>                <dbl>
1 Spirochaetota         0.57
2 Acidobacteriota       0.55
3 Atribacterota         0.51
4 Chloroflexota         0.14
5 Omnitrophota          0.14
6 Nanoarchaeota         0.08
7 Campylobacterota      0.05
8 Pseudomonadota       -0.01
Expand code
p1 <- network.survey %>%
  mutate(phylum = factor(phylum, levels = fit$phylum)) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_label(data = fit %>% mutate(phylum = factor(phylum, levels = phylum)), 
            aes(x = Inf, y = -Inf, label = paste("R^{2}:", r.squared)), 
            size = 6/.pt, hjust = 1, vjust = 0, parse = TRUE, label.size = 0, fill = "white") +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "Sqrt relative abundance Patescibacteria (%)", y = "Sqrt relative abundance (%)") +
  theme_tidy(fontsize = 6) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~phylum, scales = "free", ncol = 4) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )
Expand code
t <- c(
  "Omnitrophota", "Chloroflexota", "Pseudomonadota", "Actinomycetota",
  "Verrucomicrobiota", "Acidobacteriota", "Bacillota", "Spirochaetota"
)

network <- seqtab %>%
  inner_join(smd, by = "sample") %>% 
  filter(fraction == "non-fractionated") %>%
  inner_join(tax, by = "seqid") %>% filter(phylum %in% t) %>%
  group_by(sample, phylum) %>% summarise(relab = sum(relab), .groups = "drop") 

network <- seqtab %>% 
  inner_join(tax, by = "seqid") %>% filter(phylum == "Patescibacteria") %>%
  group_by(sample, phylum) %>% summarise(cpr = sum(relab), .groups = "drop") %>% select(-phylum) %>%
  inner_join(network, by = "sample")

# Correlation between the abundance of the phyla using the cultures
fit <- tibble("phylum" = t, "r.squared" = NA)
for (i in t) {
  model <- lm(relab ~ cpr, data = network %>% filter(phylum == i)) %>% summary()
  fit[fit$phylum == i, "r.squared"] <- model$adj.r.squared %>% round(digits = 2)
}

fit <- fit %>% arrange(desc(r.squared))
fit
# A tibble: 8 × 2
  phylum            r.squared
  <chr>                 <dbl>
1 Omnitrophota           0.81
2 Chloroflexota          0.6 
3 Actinomycetota         0.03
4 Pseudomonadota         0.02
5 Bacillota             -0.01
6 Verrucomicrobiota     -0.02
7 Acidobacteriota       -0.02
8 Spirochaetota         -0.02
Expand code
n <- network %>%
  group_by(phylum) %>% summarise(no = n(), .groups = "drop") %>%
  inner_join(fit, by = "phylum") %>% mutate(no = as.character(no)) %>%
  mutate(phylum = factor(phylum, levels = fit$phylum)) %>% 
  arrange(phylum)
  
p2 <- network %>%
  mutate(phylum = factor(phylum, levels = fit$phylum)) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_label(data = n, 
            aes(x = Inf, y = -Inf, label = paste("R^{2}: ", r.squared, " (", no, ")", sep = "")), 
            size = 6/.pt, hjust = 1.0, vjust = -0.0, parse = TRUE, label.size = 0, fill = "white") +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "Sqrt relative abundance Patescibacteria (%)", y = "Sqrt relative abundance (%)") +
  theme_tidy(fontsize = 6) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~phylum, scales = "free", ncol = 4) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )
Expand code
p1 / p2 + plot_annotation(tag_levels = c("a", "b")) & theme(
  panel.border = element_rect(fill = "transparent", linewidth = 0.5),
  axis.ticks.length = unit(0.5, "mm"),
  plot.margin = margin(t = 4, r = 1),
  plot.tag = element_text(),
  plot.tag.position = c(0.01, 0.96),
  strip.background = element_blank(),
  strip.text = element_text(margin = margin(b = 2))
)

Expand code
ggsave("figures/fig-6.pdf", width = 160, height = 160, units = "mm")

Network analysis: order

Expand code
t <- c(
  "Spirochaetota", "Acidobacteriota", "Atribacterota", "Chloroflexota",
  "Omnitrophota", "Nanoarchaeota", "Campylobacterota", "Pseudomonadota"
)

# The file survey-amplicons.tsv contains the ASVs from Äspö groundwaters published in https://doi.org/10.3389/fmicb.2018.02880
survey.amplicons <- read_tsv("data/survey-amplicons.tsv.gz", show_col_types = FALSE) %>%
  group_by(sample) %>% mutate(relab = count / sum(count)) %>% ungroup()

cpr <- survey.amplicons %>%
  filter(phylum == "Patescibacteria") %>%
  group_by(sample, order) %>% summarise(cpr = sum(relab), .groups = "drop") 

network.survey <- survey.amplicons %>%
  filter(phylum %in% t) %>%
  group_by(sample, order) %>% summarise(relab = sum(relab), .groups = "drop") %>% 
  # Add colum with abundance of CPR
  left_join(cpr, by = "sample", relationship = "many-to-many") %>%
  rename(order.clade = order.x, order.cpr = order.y)

# The filter > 19 implies that the two orders have to co-occur at least ten times
fit <- network.survey %>%
  group_by(order.clade, order.cpr) %>% summarize(n = n(), .groups = "drop_last") %>% 
  filter(n > 19) %>% ungroup() %>% na.omit()

for (i in 1:nrow(fit)) {
  data = network.survey %>% filter(order.clade == fit[i,]$order.clade & order.cpr == fit[i,]$order.cpr)
  model <- lm(relab ~ cpr, data = data)
  fit[i, "r.squared"] <- summary(model)$r.squared
}

fit <- fit %>% slice_max(r.squared, n = 8) %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  mutate(r.squared = signif(.$r.squared, digits = 2)) %>%
  arrange(desc(r.squared)) %>%
  mutate(n = as.character(n))
Expand code
label <- c(
  "Limnocylindrales UBA9983" = "Limnocylindrales\n-\nUBA9983",
  "Limnocylindrales UBA12405" = "Limnocylindrales\n-\nUBA12405",
  "Limnocylindrales JAHJFT01" = "Limnocylindrales\n-\nJAHJFT01",
  "SCGC-AAA011-G17 UBA2169" = "SCGC-AAA011-G17\n-\nUBA2169",
  "Limnocylindrales JAHISW01" = "Limnocylindrales\n-\nJAHISW01",
  "UBA2200 JAHJFT01" = "UBA2200\n-\nJAHJFT01",
  "Limnocylindrales UBA1369" = "Limnocylindrales\n-\nUBA1369",
  "Limnocylindrales Paceibacterales" = "Limnocylindrales\n-\nPaceibacterales"
  )

p1 <- network.survey %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  filter(clade %in% fit$clade) %>%
  mutate(clade = factor(clade, levels = fit$clade)) %>% 
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_label(data = fit %>% mutate(clade = factor(clade, levels = clade)), 
            aes(x = Inf, y = -Inf, label = paste("R^{2}: ", r.squared, " (", n, ")", sep = "")), 
            size = 6/.pt, hjust = 1.0, vjust = -0.0, parse = TRUE, label.size = 0, fill = "white") +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "Sqrt relative abundance Patescibacteria (%)", y = "Sqrt relative abundance (%)") +
  theme_tidy(fontsize = 6) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~clade, scales = "free", ncol = 4, labeller = as_labeller(label)) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )
Expand code
t <- c(
  "Omnitrophota", "Chloroflexota", "Pseudomonadota", "Actinobacteriota",
  "Verrucomicrobiota", "Acidobacteriota", "Bacillota", "Spirochaetota"
)

cpr <- seqtab %>%
  inner_join(smd, by = "sample") %>% 
  filter(fraction == "non-fractionated") %>%
  inner_join(tax, by = "seqid") %>% filter(phylum == "Patescibacteria") %>%
  group_by(sample, order) %>% summarise(cpr = sum(relab), .groups = "drop") 

network <- seqtab %>%
  inner_join(smd, by = "sample") %>% 
  filter(fraction == "non-fractionated") %>%
  inner_join(tax, by = "seqid") %>% filter(phylum %in% t) %>%
  group_by(sample, order) %>% summarise(relab = sum(relab), .groups = "drop") %>% 
  # Add colum with abundance of CPR
  left_join(cpr, by = "sample", relationship = "many-to-many") %>%
  rename(order.clade = order.x, order.cpr = order.y) 

# The filter > 19 implies that the two orders have to co-occur at least ten times
fit <- network %>%
  group_by(order.clade, order.cpr) %>% summarize(n = n(), .groups = "drop_last") %>% 
  filter(n > 19) %>% ungroup() %>% na.omit()

for (i in 1:nrow(fit)) {
  data = network %>% filter(order.clade == fit[i,]$order.clade & order.cpr == fit[i,]$order.cpr)
  model <- lm(relab ~ cpr, data = data)
  fit[i, "r.squared"] <- summary(model)$adj.r.squared
}
fit <- fit %>% slice_max(r.squared, n = 8) %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  mutate(r.squared = signif(.$r.squared, digits = 2)) %>%
  arrange(desc(r.squared)) %>%
  mutate(n = as.character(n))
Expand code
label = c(
  "2-01-FULL-45-10 BM507" = "2-01-FULL-45-10\n-\nBM507",
  "Gygaellales BM507" = "Gygaellales\n-\nBM507",
  "Gorgyraeales BM507" = "Gorgyraeales\n-\nBM507",
  "Pluralincolimonadales Portnoybacterales" = "Pluralincolimonadales\n-\nPortnoybacterales",
  "Gygaellales Magasanikbacterales" = "Gygaellales\n-\nMagasanikbacterales",
  "Pluralincolimonadales BM507" = "Pluralincolimonadales\n-\nBM507",
  "Gygaellales Portnoybacterales" = "Gygaellales\n-\nPortnoybacterales",
  "Gygaellales SG8-24" = "Gygaellales\n-\nSG8-24"
)


p2 <- network %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  filter(clade %in% fit$clade) %>%
  mutate(clade = factor(clade, levels = fit$clade)) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_label(data = fit %>% mutate(clade = factor(clade, levels = clade)), 
            aes(x = Inf, y = -Inf, label = paste("R^{2}: ", r.squared, " (", n, ")", sep = "")), 
            size = 6/.pt, hjust = 1.0, vjust = -0.0, parse = TRUE, label.size = 0, fill = "white") +  
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "", y = "Sqrt relative abundance (%)") +
  theme_tidy(fontsize = 6) +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~clade, scales = "free", ncol = 4, labeller = as_labeller(label)) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )
Expand code
p1 / p2 +
  plot_annotation(tag_levels = c("a", "b")) & theme(
  panel.border = element_rect(fill = "transparent", linewidth = 0.5),
  axis.ticks.length = unit(0.5, "mm"),
  plot.margin = margin(b = 4, r = 1),
  plot.tag.position = c(0.01, 0.95)
)

Expand code
ggsave("figures/fig-s4.pdf", width = 140, height = 160, units = "mm")

16S data cultures

NMDS initial experiment (Fig 3)

Expand code
set.seed(999)

nmds <- seqtab %>% # Full size fraction
  inner_join(smd, by = "sample") %>%
  filter(!is.na(age) | sample %in% t0 & groundwater == "KR0015") %>%
  group_by(seqid) %>% filter(count > 2) %>% ungroup() %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>% vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1204323 
Run 1 stress 0.1401622 
Run 2 stress 0.1321109 
Run 3 stress 0.1410864 
Run 4 stress 0.1233841 
Run 5 stress 0.1285199 
Run 6 stress 0.1257905 
Run 7 stress 0.1374948 
Run 8 stress 0.1369102 
Run 9 stress 0.1250981 
Run 10 stress 0.1155069 
... New best solution
... Procrustes: rmse 0.09176254  max resid 0.4493124 
Run 11 stress 0.1392887 
Run 12 stress 0.1288962 
Run 13 stress 0.1367449 
Run 14 stress 0.1391916 
Run 15 stress 0.1353133 
Run 16 stress 0.1186726 
Run 17 stress 0.1170498 
Run 18 stress 0.1260413 
Run 19 stress 0.1288964 
Run 20 stress 0.1185728 
*** Best solution was not repeated -- monoMDS stopping criteria:
    15: stress ratio > sratmax
     5: scale factor of the gradient < sfgrmin
Expand code
vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample") -> nmds.scores
Expand code
i <- data.frame(x0 = -1,52, y0 = 0.5, 0.5)

p1 <- nmds.scores %>%
  mutate(medium = factor(medium, levels = c("ace","lys","env"))) %>%
  ggplot(aes(x = NMDS1, y = NMDS2, 
             shape = medium,
             fill = fraction, color = fraction
             )
         ) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  geom_point(size = 4.0, stroke = 0.75) + 
  geom_text(data = nmds.scores, 
            aes(label = age), 
            size = 6/.pt, color = "black", na.rm = TRUE) + 
  scale_shape_manual(values = c("ace" = 23, "lys" = 24, "env" = 21), 
                     labels = c("Acetate", "Lysate", "Environmental")) +
  scale_color_manual(values = cols.fraction, labels = c("Environmental", "< 0.45 µm", "Non-fractionated")) +
  scale_fill_manual(values = alpha(cols.fraction, alpha = 0.2), guide = "none") +
  annotate('text', x = -Inf, y = -Inf, size = 6/.pt, hjust = -0.1, vjust = -0.5, 
           label = paste('Stress = ', round(nmds$stress, digits = 2))) +
  theme_tidy(legend = "bottom", fontsize = 6) +
  ggforce::geom_circle(aes(x0 = -1.54, y0 = 0.52, r = 0.45),
              data = i,
              inherit.aes = FALSE,
              fill = NA, color = "black", linewidth = 0.3, linetype = "dashed") +
  theme(
    aspect.ratio = 1.0,
    legend.title = element_blank(),
    legend.box = "vertical",
    legend.spacing.x = unit(0, "mm"),
    legend.box.margin = margin(),
    legend.margin = margin(),
    legend.text = element_text(margin = margin(l = 2, r = 8, unit = "pt")),
    legend.key.spacing.x = unit(0, "mm")
    )

Barplot KR0015 (Fig 3)

Expand code
t <- seqtab %>%
  filter(sample %in% paste("P21015", seq(1042,1081), sep = "_")
           ) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  filter(!is.na(phylum)) %>%
  top_n(9, mean_relab) %>% filter(phylum != "Bacteroidota")

# The most abundant phyla, averaged over all cultures
t %>% arrange(desc(mean_relab))
# A tibble: 8 × 2
  phylum           mean_relab
  <chr>                 <dbl>
1 Pseudomonadota       10.8  
2 Acidobacteriota       7.90 
3 Spirochaetota         6.28 
4 Patescibacteria       6.16 
5 Desulfobacterota      3.22 
6 Bacillota             1.54 
7 Nanoarchaeota         0.967
8 Omnitrophota          0.853
Expand code
taxref <- tax %>%
  left_join(t %>% transmute(phylum, topphylum = phylum), by = "phylum") %>%
  replace_na(list("topphylum" = "Other"))

cols.phylum <- c(
  "Pseudomonadota" = "#972D15",
  "Acidobacteriota" = "#FAEFD1",
  "Spirochaetota" = "#D8B70A",
  "Patescibacteria" = "#C7B19C",
  "Desulfobacterota" = "#A2A475",
  "Bacillota" = "#1B5331",
  "Nanoarchaeota" = "#046C9A",
  "Omnitrophota" = "#00A08A",
  "Other" = "#899DA4"
)
Expand code
p2 <- seqtab %>%
  filter(sample %in% paste("P21015", seq(1042,1081), sep = "_")
           ) %>%
  inner_join(taxref, by = "seqid") %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, topphylum) %>% 
  summarise(relab = sum(relab), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>%
  mutate(age = as.numeric(.$age)) %>% 
  mutate(fraction = case_when(
    fraction == "non-fractionated" & medium == "ace" ~ "Acetate, non-fractionated",
    fraction == "non-fractionated" & medium == "lys" ~ "Lysate, non-fractionated",
    fraction == "fractionated" & medium == "ace" ~ "Acetate, < 0.45 µm fraction",
    fraction == "fractionated" & medium == "lys" ~ "Lysate, < 0.45 µm fraction"
  )) %>%
  mutate(topphylum = factor(topphylum, levels = rev(names(cols.phylum)))) %>%
  # Call the plot
  ggplot(aes(
    age, 
    relab * 100, 
    fill = topphylum)) +
  geom_col() + 
  scale_x_continuous(n.breaks = 10) +
  facet_wrap(~ fraction, nrow = 2) +
  labs(x = 'Week', y = 'Relative abundance (%)', fill = '') +
  scale_fill_manual(values = cols.phylum) + 
  guides(fill = guide_legend(reverse = TRUE, nrow = 3)) +
  theme_tidy(legend = "bottom", fontsize = 6) + 
  theme(
    aspect.ratio = 1.0,
    strip.background = element_blank(), 
    legend.background = element_blank(),
    legend.key.height = unit(4, "mm"), legend.key.width = unit(3, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.key.spacing.y = unit(0, "mm"),
    axis.ticks.x = element_blank(), axis.text.x = element_text(vjust = 2),
    legend.box.margin = margin(t = -5)
    )
Expand code
p1 + p2 + plot_annotation(tag_levels = "a") & theme(
  plot.tag.location = "plot",
  plot.tag.position = c(0.05,0.98),
  plot.margin = margin(r = 2),
  legend.box.margin = margin(t = -10)
)

Expand code
ggsave("figures/fig-3.pdf", height = 11, width = 18, units = "cm")

NMDS cultures three groundwaters (Fig S3)

Expand code
set.seed(999)

nmds <- seqtab %>% # Full size fraction
  inner_join(smd, by = "sample") %>%
  filter(is.na(age) & groundwater == "KR0015") %>%
  group_by(seqid) %>% filter(count > 2) %>% ungroup() %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>% vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1162664 
Run 1 stress 0.1113097 
... New best solution
... Procrustes: rmse 0.04054519  max resid 0.1910308 
Run 2 stress 0.1113097 
... New best solution
... Procrustes: rmse 7.963381e-06  max resid 2.967843e-05 
... Similar to previous best
Run 3 stress 0.1113097 
... Procrustes: rmse 0.0001018374  max resid 0.000389594 
... Similar to previous best
Run 4 stress 0.1113097 
... Procrustes: rmse 5.762553e-05  max resid 0.0002199511 
... Similar to previous best
Run 5 stress 0.1162664 
Run 6 stress 0.1113097 
... Procrustes: rmse 3.109983e-05  max resid 0.0001187684 
... Similar to previous best
Run 7 stress 0.1113099 
... Procrustes: rmse 0.0002556252  max resid 0.0009758969 
... Similar to previous best
Run 8 stress 0.1162664 
Run 9 stress 0.1113097 
... Procrustes: rmse 0.0001426061  max resid 0.0005440469 
... Similar to previous best
Run 10 stress 0.1162664 
Run 11 stress 0.1113097 
... Procrustes: rmse 4.341062e-05  max resid 0.0001600677 
... Similar to previous best
Run 12 stress 0.1162664 
Run 13 stress 0.1162664 
Run 14 stress 0.1527016 
Run 15 stress 0.1113097 
... Procrustes: rmse 4.266142e-05  max resid 0.0001623409 
... Similar to previous best
Run 16 stress 0.1527132 
Run 17 stress 0.1162664 
Run 18 stress 0.1113097 
... Procrustes: rmse 3.160745e-05  max resid 0.0001207792 
... Similar to previous best
Run 19 stress 0.1527016 
Run 20 stress 0.1829632 
*** Best solution repeated 9 times
Expand code
vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample") -> nmds.scores

Plot the ordination

Expand code
p1 <- nmds.scores %>%
  mutate(medium = factor(medium, levels = c("ace","lys","env"))) %>%
  ggplot(aes(x = NMDS1, y = NMDS2, 
             shape = medium,
             fill = fraction, color = fraction
             )
         ) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  geom_point(size = 4.0, stroke = 0.75) + 
  geom_text(data = nmds.scores, 
            aes(label = age), 
            size = 6/.pt, color = "black", na.rm = TRUE) + 
  scale_shape_manual(values = c("ace" = 23, "lys" = 24, "env" = 21), 
                     labels = c("Acetate", "Lysate", "Environmental")) +
  scale_color_manual(values = cols.fraction, labels = c("Environmental", "< 0.45 µm", "Non-fractionated")) +
  scale_fill_manual(values = alpha(cols.fraction, alpha = 0.2), guide = "none") +
  annotate('text', x = -Inf, y = -Inf, size = 6/.pt, hjust = -0.1, vjust = -0.5, 
           label = paste('Stress = ', round(nmds$stress, digits = 2))) +
  theme_tidy(legend = "bottom", fontsize = 6) +
  theme(
    #axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    legend.box = "vertical",
    legend.spacing.x = unit(0, "mm"),
    legend.box.margin = margin(),
    legend.margin = margin(),
    legend.text = element_text(margin = margin(l = 2, r = 8, unit = "pt")),
    legend.key.spacing.x = unit(0, "mm")
    )
Expand code
set.seed(999)

nmds <- seqtab %>% # Full size fraction
  inner_join(smd, by = "sample") %>%
  filter(is.na(age) & groundwater == "SA1420") %>%
  group_by(seqid) %>% filter(count > 2) %>% ungroup() %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>% vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1463847 
Run 1 stress 0.1728024 
Run 2 stress 0.1827598 
Run 3 stress 0.1463848 
... Procrustes: rmse 0.0001759352  max resid 0.0004703476 
... Similar to previous best
Run 4 stress 0.1463848 
... Procrustes: rmse 0.0001427329  max resid 0.0004618032 
... Similar to previous best
Run 5 stress 0.1728024 
Run 6 stress 0.1711521 
Run 7 stress 0.1858381 
Run 8 stress 0.1463847 
... Procrustes: rmse 6.710846e-05  max resid 0.0002414644 
... Similar to previous best
Run 9 stress 0.1728024 
Run 10 stress 0.1463848 
... Procrustes: rmse 0.0001870214  max resid 0.0007340409 
... Similar to previous best
Run 11 stress 0.1463846 
... New best solution
... Procrustes: rmse 0.0001077695  max resid 0.0004026575 
... Similar to previous best
Run 12 stress 0.1976911 
Run 13 stress 0.1825336 
Run 14 stress 0.1463846 
... Procrustes: rmse 8.697484e-05  max resid 0.0003340923 
... Similar to previous best
Run 15 stress 0.1737588 
Run 16 stress 0.1867263 
Run 17 stress 0.1753243 
Run 18 stress 0.1463847 
... Procrustes: rmse 0.0001946414  max resid 0.0007199926 
... Similar to previous best
Run 19 stress 0.1463847 
... Procrustes: rmse 0.000187155  max resid 0.0006815765 
... Similar to previous best
Run 20 stress 0.1728024 
*** Best solution repeated 4 times
Expand code
vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample") -> nmds.scores
Expand code
p2 <- nmds.scores %>%
  mutate(medium = factor(medium, levels = c("ace","lys","env"))) %>%
  ggplot(aes(x = NMDS1, y = NMDS2, 
             shape = medium,
             fill = fraction, color = fraction
             )
         ) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  geom_point(size = 4.0, stroke = 0.75) + 
  geom_text(data = nmds.scores, 
            aes(label = age), 
            size = 6/.pt, color = "black", na.rm = TRUE) + 
  scale_shape_manual(values = c("ace" = 23, "lys" = 24, "env" = 21), 
                     labels = c("Acetate", "Lysate", "Environmental")) +
  scale_color_manual(values = cols.fraction, labels = c("Environmental", "< 0.45 µm", "Non-fractionated")) +
  scale_fill_manual(values = alpha(cols.fraction, alpha = 0.2), guide = "none") +
  annotate('text', x = -Inf, y = -Inf, size = 6/.pt, hjust = -0.1, vjust = -0.5, 
           label = paste('Stress = ', round(nmds$stress, digits = 2))) +
  theme_tidy(legend = "bottom", fontsize = 6) +
  theme(
    axis.title.y = element_blank(),
    legend.title = element_blank(),
    legend.box = "vertical",
    legend.spacing.x = unit(0, "mm"),
    legend.box.margin = margin(),
    legend.margin = margin(),
    legend.text = element_text(margin = margin(l = 2, r = 8, unit = "pt")),
    legend.key.spacing.x = unit(0, "mm")
    )
Expand code
set.seed(999)

nmds <- seqtab %>% # Full size fraction
  inner_join(smd, by = "sample") %>%
  filter(is.na(age) & groundwater == "SA2600") %>%
  group_by(seqid) %>% filter(count > 2) %>% ungroup() %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>% vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.07165588 
Run 1 stress 0.07165587 
... New best solution
... Procrustes: rmse 6.783627e-05  max resid 0.0002209466 
... Similar to previous best
Run 2 stress 0.1058458 
Run 3 stress 0.07540527 
Run 4 stress 0.07540524 
Run 5 stress 0.1058377 
Run 6 stress 0.1058456 
Run 7 stress 0.07165587 
... New best solution
... Procrustes: rmse 6.929929e-05  max resid 0.00022789 
... Similar to previous best
Run 8 stress 0.113827 
Run 9 stress 0.1289874 
Run 10 stress 0.1058384 
Run 11 stress 0.2466964 
Run 12 stress 0.1058454 
Run 13 stress 0.07165587 
... Procrustes: rmse 7.472214e-05  max resid 0.0002464097 
... Similar to previous best
Run 14 stress 0.07165587 
... New best solution
... Procrustes: rmse 6.164708e-05  max resid 0.0002032928 
... Similar to previous best
Run 15 stress 0.1058389 
Run 16 stress 0.113827 
Run 17 stress 0.07165591 
... Procrustes: rmse 5.919096e-05  max resid 0.0001889389 
... Similar to previous best
Run 18 stress 0.07165592 
... Procrustes: rmse 8.455526e-05  max resid 0.0002765345 
... Similar to previous best
Run 19 stress 0.105838 
Run 20 stress 0.1226938 
*** Best solution repeated 3 times
Warning in postMDS(out$points, dis, plot = max(0, plot - 1), ...): skipping
half-change scaling: too few points below threshold
Expand code
vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample") -> nmds.scores
Expand code
p3 <- nmds.scores %>%
  mutate(medium = factor(medium, levels = c("ace","lys","env"))) %>%
  ggplot(aes(x = NMDS1, y = NMDS2, 
             shape = medium,
             fill = fraction, color = fraction
             )
         ) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  geom_point(size = 4.0, stroke = 0.75) + 
  geom_text(data = nmds.scores, 
            aes(label = age), 
            size = 6/.pt, color = "black", na.rm = TRUE) + 
  scale_shape_manual(values = c("ace" = 23, "lys" = 24, "env" = 21), 
                     labels = c("Acetate", "Lysate", "Environmental"), guide = "none") +
  scale_color_manual(values = cols.fraction, labels = c("Environmental", "< 0.45 µm", "Non-fractionated"), guide = "none") +
  scale_fill_manual(values = alpha(cols.fraction, alpha = 0.2), guide = "none") +
  annotate('text', x = -Inf, y = -Inf, size = 6/.pt, hjust = -0.1, vjust = -0.5, 
           label = paste('Stress = ', round(nmds$stress, digits = 2))) +
  theme_tidy(legend = "bottom", fontsize = 6) +
  theme(
    axis.title.y = element_blank()
  )
Expand code
p1 + p2 + p3 + plot_layout(nrow = 1, guides = "collect") + plot_annotation(tag_levels = "a") & theme(
  legend.position = "bottom",
  aspect.ratio = 1.0,
  plot.tag.location = "panel",
  plot.tag.position = c(0.04, 0.97),
  plot.margin = margin(t = 3, r = 3),
  panel.border = element_rect(fill = "transparent", linewidth = 0.75)
)

Expand code
ggsave("figures/fig-s2.pdf", height = 90, width = 180, units = "mm")

QPCR KR0015 (Fig 2)

Expand code
qpcr <- read_tsv('data/qpcr.tsv', col_types = cols(.default = col_character(), 
                                      quantity = col_double(),
                                      age = col_factor(levels = seq(1:10) %>% as.character())
                                      )
         ) %>% mutate(medium = gsub("ace","srm", medium)) %>%
  rename(kingdom = Kingdom)
Expand code
p1 <- qpcr %>%
  group_by(sample, kingdom) %>% 
  summarise(mean = mean(quantity), sd = sd(quantity), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>%
  mutate(category = paste(medium, fraction)) %>%
  # Call plot
  ggplot(aes(x = fct_relevel(age, seq(1:10) %>% as.character), 
             y = mean, 
             group = category, 
             fill = fraction
             )
         ) +
  geom_line(aes(linetype = medium)) +
  facet_wrap(~ factor(kingdom, levels = c("Bacteria", "Archaea"))) +
  geom_pointrange(aes(ymin = mean - sd, 
                      ymax = mean + sd), 
                  shape = 21, stroke = 0.5) +
  labs(x = 'Week', y = expression("Gene copy number (mL"^"-1"*")")) +
  scale_y_log10(breaks = c(1e3,1e4,1e5,1e6,1e7), 
                labels = c(expression("10"^"3"),
                           expression("10"^"4"),
                           expression("10"^"5"),
                           expression("10"^"6"),
                           expression("10"^"7")
                           )
                ) +
  scale_linetype(name = "Medium:", labels = c("Acetate", "Cell lysate")) +
  scale_fill_manual(values = cols.fraction, 
                    labels = c("< 0.45 µm fraction", "Non-fractionated"),
                    name = "") +
  theme_tidy(legend = "right") +
  theme(
    plot.margin = margin(),
    legend.text = element_text(margin = margin(l = 0)),
    axis.ticks.length = unit(0.5, "mm"),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.box.margin = margin(-8),
    )

Microscopy (Fig 2)

Expand code
growth <- read_tsv("data/micro_cultures.tsv", col_types = cols(.default = col_character(), 
                                                  count = col_integer(),
                                                  date = col_date(format = "%d-%m-%Y"))
                   ) %>%
  mutate(week = case_when(
    date == "2021-10-15" ~  1,
    date == "2021-11-11" ~  5,
    date == "2021-11-25" ~  7,
    date == "2021-12-09" ~  9,
    date == "2021-12-30" ~ 12,
    date == "2022-01-18" ~ 15,
    date == "2022-02-03" ~ 17
  )) %>%
  filter(!sample %in% c("P26201_1051","P26201_1024")) %>%
  mutate(label = case_when(
    groundwater == "KR0015" ~  "Meteoric",
    groundwater == "SA1420" ~  "Marine",
    groundwater == "SA2600" ~  "Saline"
  ))

counts <- tibble(
  label = c("Meteoric","Marine","Saline"),
  min = c(680220, 59220, 14100),
  max = c(890910, 73910, 23000)
  )
Expand code
# Microscopy counts on KR0015 groundwater in duplicates, obtaining 680,220 and 890,910 cells ml-1.
# Microscopy counts on SA1420 groundwater in duplicates, obtaining 59220 and 73910 cells ml-1.
# For SA2600, this number was 14,100 and 23,000 cells ml-1

p2 <- growth %>%
  ggplot(aes(x = week, y = count, fill = fraction)) +
  geom_rect(data = counts, aes(xmin = 1, xmax = 17, ymin = min, ymax = max), fill = "#8D8680", alpha = 0.5, inherit.aes = FALSE) +
  geom_line(aes(linetype = medium)) + 
  geom_point(aes(fill = fraction), shape = 21, stroke = 0.5, size = 2) +
  scale_y_log10(limits = c(1e4,1e8),
                breaks = c(1e4,1e5,1e6,1e7),
                labels = c(expression("10"^"4"),
                           expression("10"^"5"),
                           expression("10"^"6"),
                           expression("10"^"7"))
                ) +
  scale_x_continuous(breaks = c(1,5,7,9,12,15,17), 
                     labels = c("1", "5", "", "9", "12", "", "17")) +
  scale_fill_manual(values = cols.fraction, guide = "none") +
  scale_linetype_manual(values = c("srm" = 1, "lys" = 2), name = "Medium:", 
                 labels = c("Acetate", "Cell lysate"), 
                 guide = "none") +
  facet_wrap(~ factor(label, levels = c("Meteoric", "Marine", "Saline")), ncol = 3) +
  labs(x = "Week", y = expression("Cell number (mL"^"-1"*")"), colour = "Fraction:") +
  theme_tidy(legend = "bottom") + 
  theme(
    legend.margin = margin(t = -10, l = 20),
    plot.margin = margin(),
    panel.border = element_rect(fill = "transparent", linewidth = 0.5)
    )
Expand code
p1 + p2 + plot_layout(nrow = 2, heights = c(3,2), guides = "collect") + plot_annotation(tag_levels = "a") & theme(
  aspect.ratio = 1.0,
  plot.margin = margin(b = 6),
  legend.position = "bottom",
  plot.tag.location = "plot",
  plot.tag.position = c(0.04, 0.96),
  legend.box.margin = margin(t = -8)
  )

Expand code
ggsave(filename = "figures/fig-2.pdf", width = 14, height = 14, units = "cm")

Barplot KR0015 - SA1420 - SA2600

Expand code
t <- seqtab %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "KR0015") %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  top_n(8, mean_relab) %>% arrange(desc(mean_relab)) %>% pull(phylum)
Expand code
cols.phylum <- c(
  "Pseudomonadota" = "#972D15",
  "Bacillota" = "#1B5331",
  "Bacteroidota" = "#D3D4D8",
  "Desulfobacterota" = "#A2A475",
  "Patescibacteria" = "#C7B19C",
  "Omnitrophota" = "#00A08A",
  "Spirochaetota" = "#D8B70A",
  "Campylobacterota" = "#79402E",
  "Acidobacteriota" = "#FAEFD1",
  "Nanoarchaeota" = "#046C9A",
  "Actinomycetota" = "#C6CDF7",
  "Verrucomicrobiota" = "#D69C4E",
  "Other" = "#899DA4"
)
Expand code
p1 <- seqtab %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>% 
  mutate(phylum = if_else(phylum %in% t, phylum, "Other")) %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, phylum) %>% 
  summarise(relab = sum(relab), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "KR0015") %>%
  mutate(phylum = factor(phylum, levels = names(rev(cols.phylum)))) %>%
  mutate(fraction = case_when(
    fraction == "fractionated" & medium == "lys" ~ "Fractionated: lysate",
    fraction == "fractionated" & medium == "ace" ~ "Fractionated: acetate",
    fraction == "non-fractionated" & medium == "lys" ~ "Non-fractionated: lysate",
    fraction == "non-fractionated" & medium == "ace" ~ "Non-fractionated: acetate",
  )) %>%
   # Call the plot
  ggplot(aes(
    sample, 
    relab * 100, 
    fill = phylum)) +
  geom_col(show.legend = TRUE) + 
  facet_wrap(~ fraction, scales = "free_x", nrow = 1) +
  labs(x = "", y = 'Relative abundance (%)', fill = '') +
  scale_fill_manual(values = cols.phylum, drop = FALSE) + 
  theme_tidy(legend = "bottom") + guides(fill = guide_legend(nrow = 3, reverse = TRUE)) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.key.width = unit(3.5, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.margin = margin(t = -10)
  )
Expand code
t <- seqtab %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "SA1420") %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  top_n(8, mean_relab) %>% arrange(desc(mean_relab)) %>% pull(phylum)
Expand code
p2 <- seqtab %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>% 
  mutate(phylum = if_else(phylum %in% t, phylum, "Other")) %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, phylum) %>% 
  summarise(relab = sum(relab), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "SA1420") %>%
  mutate(phylum = factor(phylum, levels = names(rev(cols.phylum)))) %>%
  mutate(fraction = case_when(
    fraction == "fractionated" & medium == "lys" ~ "Fractionated: lysate",
    fraction == "fractionated" & medium == "ace" ~ "Fractionated: acetate",
    fraction == "non-fractionated" & medium == "lys" ~ "Non-fractionated: lysate",
    fraction == "non-fractionated" & medium == "ace" ~ "Non-fractionated: acetate",
  )) %>%
   # Call the plot
  ggplot(aes(
    sample, 
    relab * 100, 
    fill = phylum)) +
  geom_col(show.legend = TRUE) + 
  facet_wrap(~ fraction, scales = "free_x", nrow = 1) +
  labs(x = "", y = 'Relative abundance (%)', fill = '') +
  scale_fill_manual(values = cols.phylum, drop = FALSE) + 
  theme_tidy(legend = "bottom") + guides(fill = guide_legend(nrow = 3, reverse = TRUE)) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.key.width = unit(3.5, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.margin = margin(t = -10)
  )
Expand code
t <- seqtab %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "SA2600") %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  top_n(8, mean_relab) %>% arrange(desc(mean_relab)) %>% pull(phylum)
Expand code
p3 <- seqtab %>%
  filter(grepl("P26", sample)) %>%
  inner_join(tax, by = "seqid") %>% 
  mutate(phylum = if_else(phylum %in% t, phylum, "Other")) %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, phylum) %>% 
  summarise(relab = sum(relab), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>% filter(groundwater == "SA2600") %>%
  mutate(phylum = factor(phylum, levels = names(rev(cols.phylum)))) %>%
  mutate(fraction = case_when(
    fraction == "fractionated" & medium == "lys" ~ "Fractionated: lysate",
    fraction == "fractionated" & medium == "ace" ~ "Fractionated: acetate",
    fraction == "non-fractionated" & medium == "lys" ~ "Non-fractionated: lysate",
    fraction == "non-fractionated" & medium == "ace" ~ "Non-fractionated: acetate",
  )) %>%
   # Call the plot
  ggplot(aes(
    sample, 
    relab * 100, 
    fill = phylum)) +
  geom_col(show.legend = TRUE) + 
  facet_wrap(~ fraction, scales = "free_x", nrow = 1) +
  labs(x = "", y = 'Relative abundance (%)', fill = '') +
  scale_fill_manual(values = cols.phylum, drop = FALSE) + 
  theme_tidy(legend = "bottom") + guides(fill = guide_legend(nrow = 3, reverse = TRUE)) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.key.width = unit(3.5, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.margin = margin(t = -10)
  )
Expand code
p1 + p2 + p3 + plot_layout(nrow = 3, ncol = 1, guides = "collect", widths = c(2,2,1)) + 
  plot_annotation(tag_levels = "a") & theme(
  axis.ticks.x = element_blank(),
  axis.text.x = element_blank(),
  axis.title.x = element_blank(),
  plot.tag.location = "panel",
  plot.tag.position = c(-0.05, 1.06),
  legend.position = "bottom",
  legend.box.margin = margin(t = 10),
  aspect.ratio = 1
)

Expand code
ggsave(filename = "figures/fig-s3.pdf", width = 180, height = 180, units = "mm")

Genome-resolved metagenomics

Basic stats on the MAGs

Expand code
t <- c("Bacillota","Desulfobacterota","Pseudomonadota","Patescibacteria","Nanoarchaeota","Other")

# Prepare data for plot
i <- quality %>%
  inner_join(bintax, by = "genome") %>%
  mutate(phylum = ifelse(phylum %in% t, phylum, "Other")) %>%
  mutate(phylum = factor(phylum, levels = t)) %>%
  select(genome, phylum, completeness, gc_content, genome_size, coding_density) %>%
  distinct() %>%
  mutate(genome.size = genome_size / 1e6)
Expand code
lm(gc_content ~ genome_size, data = quality) %>% summary()

Call:
lm(formula = gc_content ~ genome_size, data = quality)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.15655 -0.06232 -0.02604  0.07979  0.13851 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.058e-01  3.670e-02   8.332 1.26e-09 ***
genome_size 5.080e-08  1.029e-08   4.935 2.23e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.09515 on 33 degrees of freedom
Multiple R-squared:  0.4246,    Adjusted R-squared:  0.4072 
F-statistic: 24.36 on 1 and 33 DF,  p-value: 2.233e-05
Expand code
p1 <- i %>%
  ggplot(aes(
    genome.size, 
    gc_content, 
    color = phylum,
    fill = phylum
    )) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#d9d9d9", se = TRUE, linewidth = 0.4) +
  geom_point(size = 2.5, shape = 21, stroke = 0.5, key_glyph = "rect") + 
  labs(x = "Estimated genome size (Mb)", y = "Estimated GC content (%)", color = "") +
  scale_colour_manual(values = cols.mag, guide = "none") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5)) +
  theme_tidy(fontsize = 6) + 
  scale_x_continuous(breaks = c(1,2,3,4,5,6)) +
  annotate("text", x = Inf, y = -Inf, 
           label = "paste(R ^ 2, \" = 0.41\")", 
           size = 6/.pt, vjust = -0.5, hjust = 1, parse = TRUE) +
  theme(
    axis.line = element_line(),
    panel.border = element_blank()
  )
Expand code
p2 <- i %>%
  mutate(phylum = factor(phylum, levels = c(
    "Nanoarchaeota", "Patescibacteria", "Other", "Bacillota", "Desulfobacterota", "Pseudomonadota"
  ))) %>%
  ggplot(aes(
    phylum, 
    completeness, 
    group = phylum, 
    fill = phylum, 
    color = phylum
    )) +
  geom_boxplot(linewidth = 0.5, outlier.colour = "white") +
  geom_jitter(width = 0.2, size = 2, shape = 21, stroke = 0.5) +
  labs(x = "", y = "Estimated completeness (%)", fill = "") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), guide = "none") + 
  scale_color_manual(values = cols.mag, guide = "none") +
  theme_tidy(legend = "bottom", fontsize = 6) +
  theme(
    axis.line = element_line(),
    panel.border = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_text(margin = margin()),
    plot.margin = margin()
  )
Expand code
lm(coding_density ~ genome_size, data = quality) %>% summary()

Call:
lm(formula = coding_density ~ genome_size, data = quality)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.05556 -0.01303 -0.00046  0.01275  0.03787 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.172e-01  8.112e-03 113.064  < 2e-16 ***
genome_size -6.991e-09  2.275e-09  -3.072  0.00424 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.02103 on 33 degrees of freedom
Multiple R-squared:  0.2224,    Adjusted R-squared:  0.1989 
F-statistic: 9.439 on 1 and 33 DF,  p-value: 0.004236
Expand code
p3 <- i %>%
  ggplot(aes(
    genome.size, 
    coding_density, 
    color = phylum,
    fill = phylum
    )) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#d9d9d9", se = TRUE, linewidth = 0.4) +
  geom_point(size = 2.5, shape = 21, stroke = 0.5) + 
  labs(x = "Estimated genome size (Mb)", y = "Estimated coding density", color = "") +
  scale_colour_manual(values = cols.mag, guide = "none") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), guide = "none") +
  theme_tidy(fontsize = 6) + 
  scale_x_continuous(breaks = c(1,2,3,4,5,6)) +
  scale_y_continuous(n.breaks = 7) +
  annotate("text", x = 6, y = 0.93, 
           label = "paste(R ^ 2, \" = 0.20\")", 
           size = 6/.pt, vjust = 1, hjust = 1, parse = TRUE) +
  theme(
    axis.line = element_line(),
    panel.border = element_blank()
  )
Expand code
nmds <- coverm %>%
  # Sum the abundance of both coverage per metagenome
  mutate(abundance = 1) %>% 
  inner_join(emapper, by = "genome", relationship = "many-to-many") %>%
  # Filter out NA and add multiple KOs on new line
  separate_rows(ko, sep = ",") %>%
  # Sum the abundance of each KO within the metagenomes
  group_by(genome, ko) %>% 
  summarise(abundance = sum(abundance), .groups = "drop") %>%
  # Pivot to prepare a numerical matrix
  pivot_wider(names_from = "ko", values_from = "abundance", values_fill = 0) %>%
  column_to_rownames("genome") %>%
  vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.06432893 
Run 1 stress 0.06458387 
... Procrustes: rmse 0.06357583  max resid 0.1237997 
Run 2 stress 0.06407586 
... New best solution
... Procrustes: rmse 0.06501158  max resid 0.1229768 
Run 3 stress 0.06649645 
Run 4 stress 0.06967825 
Run 5 stress 0.06485684 
Run 6 stress 0.06627825 
Run 7 stress 0.06999917 
Run 8 stress 0.06432922 
... Procrustes: rmse 0.06485113  max resid 0.1250963 
Run 9 stress 0.06574132 
Run 10 stress 0.06486508 
Run 11 stress 0.06672071 
Run 12 stress 0.06681644 
Run 13 stress 0.0691574 
Run 14 stress 0.06509552 
Run 15 stress 0.06552364 
Run 16 stress 0.06564337 
Run 17 stress 0.06432906 
... Procrustes: rmse 0.06487116  max resid 0.1250532 
Run 18 stress 0.06568593 
Run 19 stress 0.0641662 
... Procrustes: rmse 0.002380285  max resid 0.01093463 
Run 20 stress 0.065016 
*** Best solution was not repeated -- monoMDS stopping criteria:
     2: no. of iterations >= maxit
    18: stress ratio > sratmax
Expand code
nmds.scores <- vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("genome")
Expand code
p4 <- nmds.scores %>%
  inner_join(bintax, by = "genome") %>%
  mutate(phylum = ifelse(phylum %in% t, phylum, "Other")) %>%
  # Plot
  ggplot(aes(x = NMDS1 * -1, 
             y = NMDS2, 
             colour = fct_relevel(phylum,t),
             fill = phylum
             )) +
  geom_vline(xintercept = 0, linetype = 'dotted', linewidth = 0.5) +
  geom_hline(yintercept = 0, linetype = 'dotted', linewidth = 0.5) +
  # Points for samples, coloured by nature
  geom_point(size = 3, shape = 21, stroke = 0.5) +
  theme_tidy(fontsize = 6) + labs(x = "NMDS1", y = "NMDS2") +
  scale_color_manual(values = cols.mag, name = "Phylum", guide = "none") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), name = "", guide = "none") +
  scale_y_continuous(breaks = c(-0.5, 0, 0.5)) + 
  annotate('text', x = -Inf, y = -Inf, size = 6/.pt,
           label = paste('Stress = ', round(nmds$stress, digits = 2)),
           hjust = -0.1, vjust = -0.5) +
  theme(
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    axis.title.y = element_text(margin = margin()),
    legend.spacing.x = unit(-0.5, "mm")
    )
Expand code
p1 + p2 + p3 + p4 + plot_layout(guides = "collect", nrow = 2) & plot_annotation(tag_levels = c("a", "b", "c", "d")) &
  theme(
    aspect.ratio = 1.0,
    plot.margin = margin(),
    plot.tag.location = "panel",
    plot.tag.position = c(0.04, 0.95),
    legend.position = "bottom",
    axis.ticks.length = unit(".75", "mm"),
    legend.margin = margin(t = -4),
    legend.text = element_text(hjust = 0),
    legend.key.size = unit("3", "mm"),
    legend.title = element_blank()
    )

Expand code
ggsave(filename = "figures/fig-4.pdf", width = 12, height = 12, units = "cm")

KO mapper

Expand code
km <- data.frame()
for (file in list.files("data/kmapper/", full.names = TRUE)) {
  genome <- substr(file, 15, nchar(file)) %>% gsub(".txt", "", .)
  i <- read_tsv(file, col_names = FALSE, show_col_types = FALSE)
  i <- i %>% filter(grepl("M00", X1)) %>%
    mutate(genome = genome) %>%
    mutate(module = substr(X1, 1, 6)) %>%
    mutate(completeness = sub(".*\\s", "", X1)) %>%
    mutate(completeness = gsub(")", "", completeness)) %>%
    select(-X1)
  km <- km %>% rbind(i)
}

km <- km %>%
  mutate(genes = gsub("/.*", "", completeness) %>% as.numeric()) %>%
  mutate(total = gsub(".*/", "", completeness) %>% as.numeric()) %>%
  mutate(score = case_when(
    genes == total ~ "c",
    (total - genes) == 1 ~ "i",
    .default = "m"
  )) %>%
  select(genome, module, score) %>%
  # Assign a value for every genome-module combination
  pivot_wider(names_from = "module", values_from = "score", values_fill = "m") %>%
  pivot_longer(-1, values_to = "score", names_to = "module")
Expand code
km <- km %>% mutate(label = case_when(
    module == "M00001" ~ "M00001: Glycolysis",
    module == "M00002" ~ "M00002: Glycolysis core module",
    #module == "M00003" ~ "M00003: Gluconeogenesis",
    module == "M00307" ~ "M00307: Pyruvate oxidation",
    module == "M00008" ~ "M00008: Enter-Doudoroff pathway",
    module == "M00009" ~ "M00009: TCA cycle",
    module == "M00004" ~ "M00004: Pentose phosphate pathway (PPP)",
    module == "M00007" ~ "M00007: PPP, non-oxidative",
    module == "M00082" ~ "M00082: Fatty acids biosynthesis",
    module == "M00549" ~ "M00549: UDP-Glc biosynthesis",
    module == "M00909" ~ "M00909: UDP-GlcNAc biosynthesis",
    #module == "M00020" ~ "M00020: AA biosynthesis: serine",
    module == "M00016" ~ "M00016: AA biosynthesis: lysine",
    module == "M00021" ~ "M00021: AA biosynthesis: cysteine",
    module == "M00019" ~ "M00019: AA biosynthesis: valine/isoleucine",
    #module == "M00017" ~ "M00017: AA biosynthesis: methionine",
    #module == "M00048" ~ "M00048: De novo purine biosynthesis",
    module == "M00053" ~ "M00053: Deoxyribonucleotide biosynthesis",
    #module == "M00051" ~ "M00051: De novo pyrimidine biosynthesis",
    module == "M00052" ~ "M00052: Pyrimidine ribonucleotide biosynthesis",
    .default = NA
  ))

cols.phylum <- c(
  "Desulfobacterota" = "#A2A475",
  "Pseudomonadota" = "#972D15",
  "Bacillota" = "#1B5331",
  "Patescibacteria" = "#C7B19C",
  "Nanoarchaeota" = "#00A08A",  
  "Chloroflexota" = "#D69C4E",
  "Spirochaetota" = "#D8B70A",
  "Campylobacterota" = "#FAEFD1",
  "Iainarchaeota" = "#046C9A"
)

name <- bintax %>%
  filter(genome %in% km$genome) %>%
  mutate(phylum = factor(phylum, levels = names(cols.phylum))) %>%
  arrange(phylum) %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  pull(taxon) %>% unique()
Expand code
p1 <- km %>%
  inner_join(bintax, by = "genome") %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  mutate(taxon = factor(taxon, levels = name)) %>%
  filter(!is.na(label)) %>%
  ggplot(aes(
    x = taxon,
    y = fct_rev(label),
    fill = score
  )) +
  geom_tile(color = "white") +
  scale_x_discrete(position = "top") +
  scale_fill_manual(values = c("c" = "#707151",
                               "i" = "#B6B79B",
                               "m" = "#F0F1EB"),
                    label = c("c" = "Complete",
                              "i" = "One block missing",
                              "m" = "Incomplete or missing"),
                    name = "Module completeness:") +
  theme_tidy(fontsize = 6) + coord_fixed() +
  geom_vline(xintercept =  7.5, linewidth = 0.6, color = "white") +
  geom_vline(xintercept = 13.5, linewidth = 0.6, color = "white") +
  geom_vline(xintercept = 19.5, linewidth = 0.6, color = "white") +
  geom_vline(xintercept = 22.5, linewidth = 0.6, color = "white") +
  geom_vline(xintercept = 24.5, linewidth = 0.6, color = "white") +
  annotate("segment", x =    0.5, xend = 7.45, y = 16, yend = 16, color = "#A2A475", linewidth = 2) +
  annotate("segment", x =  7.55, xend = 13.45, y = 16, yend = 16, color = "#972D15", linewidth = 2) +
  annotate("segment", x = 13.55, xend = 19.45, y = 16, yend = 16, color = "#1B5331", linewidth = 2) +
  annotate("segment", x = 19.55, xend = 22.45, y = 16, yend = 16, color = "#C7B19C", linewidth = 2) +
  annotate("segment", x = 22.55, xend = 24.45, y = 16, yend = 16, color = "#00A08A", linewidth = 2) +
  annotate("segment", x = 24.55, xend = 25.45, y = 16, yend = 16, color = "#D69C4E", linewidth = 2) +
  annotate("segment", x = 25.55, xend = 26.45, y = 16, yend = 16, color = "#D8B70A", linewidth = 2) +
  annotate("segment", x = 26.55, xend = 27.45, y = 16, yend = 16, color = "#FAEFD1", linewidth = 2) +
  annotate("segment", x = 27.55, xend = 28.45, y = 16, yend = 16, color = "#046C9A", linewidth = 2) +
  theme(
    legend.position = "bottom",
    legend.key.height = unit(3, "mm"),
    legend.text = element_text(margin = margin(l = 2)),
    legend.margin = margin(t = -10),
    axis.text.x.top = element_text(angle = 90, hjust = 0, vjust = 0.6),
    axis.ticks = element_blank(),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_blank()
  )

Metabolic weight

Expand code
# Two MAGs have no metabolic weight (i.e. are either low abundant or have low metabolic potential)
i <- list(
  genome = c("MEGAHIT-MaxBin2Refined-VK-3516-KFM02a-3_S40_L002.026", "MEGAHIT-MetaBAT2Refined-VK-3516-KFM02a-3_S40_L002.49_sub"),
  pathway = c("Fermentation", "Fermentation"), 
  mw = c(0, 0))

mw <- read_tsv("data/metabolic.tsv.gz", show_col_types = FALSE) %>%
  select(-sample, -mw.function) %>%
  rbind(i) %>%
  group_by(genome, pathway) %>% summarise(mw = sum(mw), .groups = "drop") %>% 
  pivot_wider(names_from = "pathway", values_from = "mw", values_fill = 0) %>%
  pivot_longer(-1, names_to = "pathway", values_to = "mw") %>%
  # Prepare the labels for the plot
  mutate(label = pathway) %>%
  mutate(label = gsub(".arbon", "C", label)) %>%
  mutate(label = gsub(" - ", ": ", label)) %>%
  mutate(label = gsub("amino acid", "AA", label)) %>%
  mutate(label = gsub(" pathway", "", label)) %>%
  mutate(label = gsub("\\|\\|vnfDKG\\|\\|", ", ", label)) %>%
  mutate(label = gsub(" degradation", "", label)) %>%
  mutate(label = gsub("methanol oxidation", "methanol", label)) %>%
  mutate(label = gsub("Iron oxidation:", "Iron oxidation", label)) %>%
  mutate(label = gsub("N2", "N", label))
Expand code
d <- c(
  "Organic C oxidation: complex C",
  "Organic C oxidation: AA utilization",
  "Organic C oxidation: aromatics",
  #"Organic C oxidation: methanol",
  "Organic C oxidation: fatty acid",
  "C fixation: CBB cycle (Rubisco)",
  "C fixation: Reverse TCA cycle",
  "C fixation: 3HP/4HB",
  "C fixation: Wood-Ljungdahl",
  "Acetate oxidation",
  "Fermentation",
  #"Hydrogen generation",
  #"Hydrogen oxidation",
  "N fixation: nifDK, nifH",
  "Sulfur oxidation: sdo",
  "Sulfate reduction",
  "Sulfide oxidation: sqr",
  "Iron oxidation",
  "Iron reduction"
) %>% rev()

cols.phylum <- c(
  "Desulfobacterota" = "#A2A475",
  "Pseudomonadota" = "#972D15",
  "Bacillota" = "#1B5331",
  "Patescibacteria" = "#C7B19C",
  "Nanoarchaeota" = "#00A08A",  
  "Chloroflexota" = "#D69C4E",
  "Spirochaetota" = "#D8B70A",
  "Campylobacterota" = "#FAEFD1",
  "Iainarchaeota" = "#046C9A")
Expand code
name <- bintax %>%
  filter(phylum %in% names(cols.phylum)) %>%
  mutate(phylum = factor(phylum, levels = names(cols.phylum))) %>%
  arrange(phylum) %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  pull(taxon) %>% unique()
Expand code
p2 <- mw %>%
  inner_join(bintax, by = "genome") %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  # Order the genomes
  filter(taxon %in% name) %>%
  mutate(taxon = factor(taxon, name)) %>%
  # Order the pathways
  filter(label %in% d) %>%
  mutate(label = factor(label, d)) %>%
  mutate(score = if_else(mw > 0, "c", "m")) %>%
  # Plot
  ggplot(aes(
    x = taxon, 
    y = label,
    fill = score
    )) +
  geom_tile(color = "white") +
  scale_fill_manual(values = c("c" = "#707151",
                               "m" = "#F0F1EB"),
                    label = c("c" = "Present",
                              "m" = "Absent")) +
  theme_tidy(fontsize = 6) + coord_fixed() +
  geom_vline(xintercept =  7.5, linewidth = 0.6, color = "white") +
  geom_vline(xintercept = 13.5, linewidth = 0.6, color = "white") +
  geom_vline(xintercept = 19.5, linewidth = 0.6, color = "white") +
  geom_vline(xintercept = 22.5, linewidth = 0.6, color = "white") +
  geom_vline(xintercept = 24.5, linewidth = 0.6, color = "white") +
  annotate("segment", x =   0.5, xend =  7.45, y = 17, yend = 17, color = "#A2A475", linewidth = 2) +
  annotate("segment", x =  7.55, xend = 13.45, y = 17, yend = 17, color = "#972D15", linewidth = 2) +
  annotate("segment", x = 13.55, xend = 19.45, y = 17, yend = 17, color = "#1B5331", linewidth = 2) +
  annotate("segment", x = 19.55, xend = 22.45, y = 17, yend = 17, color = "#C7B19C", linewidth = 2) +
  annotate("segment", x = 22.55, xend = 24.45, y = 17, yend = 17, color = "#00A08A", linewidth = 2) +
  annotate("segment", x = 24.55, xend = 25.45, y = 17, yend = 17, color = "#D69C4E", linewidth = 2) +
  annotate("segment", x = 25.55, xend = 26.45, y = 17, yend = 17, color = "#D8B70A", linewidth = 2) +
  annotate("segment", x = 26.55, xend = 27.45, y = 17, yend = 17, color = "#FAEFD1", linewidth = 2) +
  annotate("segment", x = 27.55, xend = 28.45, y = 17, yend = 17, color = "#046C9A", linewidth = 2) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    legend.position = "bottom",
    legend.title = element_blank(),
    legend.spacing.x = unit(0, "mm"),
    legend.box.margin = margin(),
    panel.border = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    legend.margin = margin(t = -10),
    legend.key.height = unit(3, "mm")
  )
Expand code
p1 / p2 + plot_annotation(tag_levels = 'a') & theme(
  legend.box.margin = margin(l = -8),
  plot.margin = margin(b = 6),
  legend.key.width = unit(2, "mm"),
  legend.text = element_text(margin = margin(l = 2)),
  plot.tag.location = 'panel',
  plot.tag.position = c(-0.05, 1.03)
  )

Expand code
ggsave("figures/fig-5.pdf", width = 180, height = 180, units = "mm")